# -*- coding: utf-8 -*-
"""
Code Introduction:
    This code 
Version History:
    Created: Mon Oct  5 14:03:46 2020
    Current: 

@author: Xing Guo (guoxing.econ@gmail.com)

"""

#%% Setup Working Directory

import os
## Windows System Path
FolderList = [xx+"Dropbox (Bank of Canada)\\Research Projects\\OHANK\\Empirics\\Analysis_Distribution\\" \
              for xx in ["D:\\","B:\\","/mnt/b/"]]
for Folder in FolderList:
    if os.path.exists(Folder):
        os.chdir(Folder)    

## Output Folder
OutputFolder = 'TableGraph/'
if not os.path.exists(OutputFolder):
    os.makedirs(OutputFolder)
# End of Section: Setup Working Directory
###############################################################################

#%% Import Moduels

## System Tools

import numpy as np
from collections import OrderedDict
import time
## I/O Tools
import _pickle as pickle
## Data Process Tools
import pandas as pd
# import modin.pandas as pd
import datetime
## Graphs
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as figpdf
## Statistical Tools
import statsmodels.formula.api as sm
from statsmodels.tsa.api import VAR
from scipy.stats import mstats
from scipy.interpolate import interp1d
import statsmodels.api as SMAPI
from statsmodels.tsa.tsatools import detrend as DeTrend
from statsmodels.tsa.filters.hp_filter import hpfilter as HPfilter
from statsmodels.tsa.filters.bk_filter import bkfilter as BKfilter
## Database API
from fredapi import Fred
## Numerical API
from scipy.interpolate import interp1d
## Regular Expression API
import re
import multiprocessing as mp

idx = pd.IndexSlice

# from Toolbox_Graph import Graph_PDF, GraphSetup, MultiLine_SinglePlot, NBER_RecessionBar, IRF_SinglePlot
# End of Section: Import Moduels
###############################################################################

#%% Cleaning

### Generate the Sample

## Data
SFS_2016 = pd.read_stata('../Data/SFS/sfs2016en.dta')

## Variables
VarInfo = pd.read_excel('../Data/SFS/VarDict.xlsx')
VarDict = {VarInfo.loc[ii,'Code'].lower(): VarInfo.loc[ii,'Name'] for ii in VarInfo.index}

## Sample
DS = SFS_2016.rename(columns=VarDict)[VarDict.values()]

## Other Variables
DS['NetWorth_Liquid'] = DS['Asset_Deposit']+DS['Asset_Bonds'] \
                        +DS['Asset_MutualFund']+DS['Asset_Stock'] \
                        -DS['Debt_CreditCard']
#%% Functions to Compute the Distributional Features
def QW_ApproxPoints(qq,qw=[]):
    TempInd = ~pd.isnull(qq)
    if len(qw)==0:
        qw = np.ones(len(qq))/len(qq)
    else:
        qw = qw[TempInd].values
        qw = qw/sum(qw)
    
    qq = qq[TempInd].values
    
    OrderInd = np.argsort(qq)
    qq = qq[OrderInd]
    qw = qw[OrderInd]
    return qq, qw

def QW_Quantile(xx,qq,qw=[]):
    qq, qw = QW_ApproxPoints(qq,qw)
    cdf = np.cumsum(qw)
    ff = interp1d(cdf,qq,kind='cubic')
    return ff(xx)

def Dist_byQuantile(DS,qq_name,qw_name='',xx=[0.25,0.50,0.75]):
    TempDS = DS[np.isfinite(DS[qq_name])]
    qq = TempDS[qq_name]
    if qw_name=='':
        qw = qq.copy()
        qw = 1
    else:
        qw = TempDS[qw_name]
        
    quant = QW_Quantile(xx,qq,qw=qw)
    qq, qw = QW_ApproxPoints(qq,qw)
    qq_qw = qq * qw
    cdf = np.cumsum(qw)
    
    AccQQ_up = np.zeros(len(xx))
    AccQW_up = np.zeros(len(xx))
    Total = sum(qq_qw)
    for ii in range(len(xx)):
        TempInd = cdf>=xx[ii]
        AccQQ_up[ii] = sum(qq_qw[TempInd])/Total
        AccQW_up[ii] = sum(qw[TempInd])
    Dist = pd.concat([pd.Series(AccQW_up,name='AccQW_up'), \
                      pd.Series(AccQQ_up,name='AccQQ_up'), \
                      pd.Series(quant,name='QQ'),pd.Series(xx,name='Quant')],axis=1)
    Dist['AccQQ_down'] = 1-Dist['AccQQ_up']
    Dist['AccQW_down'] = 1-Dist['AccQW_up']
    return Dist

def Dist_byHistogram(DS,qq_name,qw_name,bins=50):
    TempDS = DS[[qq_name,qw_name]].dropna()
    if type(bins)==int:
        Unit = 1e3
        x1 = np.floor(TempDS[qq_name].min()/Unit)*Unit
        x2 = np.ceil(TempDS[qq_name].max()/Unit)*Unit
        bins = int((x2-x1)/Unit)+1
        cutoff = np.linspace(x1,x2,num=bins)
        # cutoff = np.geomspace(1,x2+(1-x1),num=bins-1)
        # cutoff = cutoff - (1-x1)
        # cutoff = np.geomspace(1,x2+(1-x1),num=bins-1)
        num = bins+1
    elif type(bins)==list: 
        cutoff = bins
        num = len(bins)+1
    
    cutoffbins = [-np.inf,*cutoff,np.inf]
    TempDS['Group'] = pd.cut(TempDS[qq_name],bins=cutoffbins,labels=range(num))
    TempDS['QQ'] = TempDS[qq_name]
    TempDS['QW'] = TempDS[qw_name]/TempDS[qw_name].sum()
    TempDS['QQ_QW'] = TempDS['QQ']*TempDS['QW']
    Hist = TempDS.groupby('Group')[['QW','QQ_QW']].sum()
    Hist['QQ_left'] = pd.Series(cutoffbins[0:-1])
    Hist['QQ_right'] = pd.Series(cutoffbins[1:])
    Hist['QQ_mid'] = (Hist['QQ_left']+Hist['QQ_right'])/2
    Hist.loc[0,'QQ_mid'] = Hist.loc[0,'QQ_right']
    Hist.loc[num-1,'QQ_mid'] = Hist.loc[num-1,'QQ_left']
    
    return Hist

def Dist_GiniCoef(DS,qq_name,qw_name):
    TempDS = DS.loc[np.isfinite(DS[qq_name]) & np.isfinite(DS[qw_name]),:]
    TempDS = TempDS[[qq_name,qw_name]].dropna().sort_values(by=qq_name).reset_index(drop=True)
    TempDS['QW'] = TempDS[qw_name]/TempDS[qw_name].sum()
    TempDS['QQ_QW'] = TempDS[qq_name]*TempDS['QW']
    TempDS['QQ_QW'] = TempDS['QQ_QW']/TempDS['QQ_QW'].sum()
    TempDS['AccProb'] = TempDS['QW'].cumsum()
    TempDS['AccQQ'] = TempDS['QQ_QW'].cumsum()
    TempDS['AccProb_'] = (TempDS['AccProb']+TempDS['AccProb'].shift(1))/2
    TempDS.loc[0,'AccProb_'] = TempDS.loc[0,'AccProb']/2
    TempDS['AccQQ_'] = (TempDS['AccQQ']+TempDS['AccQQ'].shift(1))/2
    TempDS.loc[0,'AccQQ_'] = TempDS.loc[0,'AccQQ']/2
    Gini = (TempDS['AccQQ_']*TempDS['QW']).sum()/ \
           (TempDS['AccProb_']*TempDS['QW']).sum()
    return Gini

def QW_Mean(DS,qq_name,qw_name=''):
    Temp = DS.loc[np.isfinite(DS[qq_name]),:]
    if qw_name=='':
        return Temp[qq_name].mean()
    else:
        Temp = Temp.loc[np.isfinite(Temp[qw_name]),:]
        Temp[qw_name] = Temp[qw_name]/Temp[qw_name].sum()
        return (Temp[qq_name]*Temp[qw_name]).sum()
#%% Statistics

# Var_Income = 'Income_Market'
# DS[Var_Income] = DS[Var_Income].astype('float')

# DS['Ratio_NetWorthLiquid_Income_1'] = DS['NetWorth_Liquid']/DS['Income_Market']
# DS['Ratio_NetWorthLiquid_Income_2'] = DS['NetWorth_Liquid']/QW_Mean(DS,'Income_Market','StatWeight')